{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Excel File\n",
    "filename = 'data/car_financing.xlsx'\n",
    "df = pd.read_excel(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Filtering \n",
    "car_filter = df['car_type']=='Toyota Sienna'\n",
    "#interest_filter = df['interest_rate']==0.0702\n",
    "car_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc[car_filter, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Filtering \n",
    "car_filter = df['car_type']=='Toyota Sienna'\n",
    "interest_filter = df['interest_rate']==0.0702\n",
    "df = df.loc[car_filter & interest_filter, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Approach 1 dictionary substitution using rename method\n",
    "df = df.rename(columns={'Starting Balance': 'starting_balance',\n",
    "                        'Interest Paid': 'interest_paid', \n",
    "                        'Principal Paid': 'principal_paid',\n",
    "                        'New Balance': 'new_balance'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Approach 2 list replacement\n",
    "# Only changing Month -> month, but we need to list the rest of the columns\n",
    "df.columns = ['month',\n",
    "              'starting_balance',\n",
    "              'Repayment',\n",
    "              'interest_paid',\n",
    "              'principal_paid',\n",
    "              'new_balance',\n",
    "              'term',\n",
    "              'interest_rate',\n",
    "              'car_type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Approach 1\n",
    "# This approach allows you to drop multiple columns at a time \n",
    "df = df.drop(columns=['term'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Approach 2 use the del command\n",
    "del df['Repayment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Removing or Filling in Missing Data\n",
    "This is an important subject as before you can graph data, you should make sure you aren't trying to graph some missing values as that can cause an error or misinterpretation of the data. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['interest_paid'].value_counts(dropna = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(df['interest_paid'].value_counts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove Missing Values\n",
    "You can remove missing values by using the `dropna` method. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[30:40]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# You can drop entire rows if they contain 'any' nans in them or 'all'\n",
    "# this may not be the best strategy for our dataset\n",
    "df[30:40].dropna(how = 'any')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filling in Missing Values\n",
    "There are a [variety of ways to fill in missing values](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Looking at where missing data is located\n",
    "df['interest_paid'][30:40]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filling in the nan with a zero is probably a bad idea. \n",
    "df['interest_paid'][30:40].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# back fill in value\n",
    "df['interest_paid'][30:40].fillna(method='bfill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forward fill in value\n",
    "df['interest_paid'][30:40].fillna(method='ffill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# linear interpolation (filling in of values)\n",
    "df['interest_paid'][30:40].interpolate(method = 'linear')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interest paid before filling in the nan with a value\n",
    "df['interest_paid'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in with the actual value\n",
    "interest_missing = df['interest_paid'].isna()\n",
    "df.loc[interest_missing,'interest_paid'] = 93.24"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interest paid after filling in the nan with a value\n",
    "df['interest_paid'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notice we dont have NaN values in the DataFrame anymore\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# not null\n",
    "notNullFilter = ~df['interest_paid'].isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc[notNullFilter, :]"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
